# Descriptives PRESVEN data

library(tidyverse)
library(gt)
library(readxl)
library(stringi)
library(sf)
library(gtsummary)
library(Hmisc)

ven <- readRDS("ven_elec_2006_2024_final.rds")

##### Variable table ### 

# Define custom replacements for accented characters
remove_accents <- function(text) {
  text %>%
    str_replace_all(c(
      "á" = "a", "é" = "e", "í" = "i", "ó" = "o", "ú" = "u",
      "Á" = "A", "É" = "E", "Í" = "I", "Ó" = "O", "Ú" = "U"
    ))
}


# Define total municipalities and parishes in Venezuela
total_municipalities <- 335
total_parishes <- 1136

# Define official total mesas and centros for each year
official_counts <- tibble(
  year = as.factor(c(2024, 2018, 2013, 2012, 2006)),
  total_mesas = c(30026, 34143, 39322, 38496, 32331),
  total_centros = c(15797, 14638, 13557, 14125, 11118)
)

# Compute dataset coverage
summary_counts <- ven %>%
  mutate(
    mun_edo = paste(estado, municipio, sep = "_"),
    mun_par = paste(municipio, parroquia, sep = "_"),
    nombre_centro = remove_accents(nombre_centro),
    par_centro = paste(parroquia, centro, sep = "_")
  ) %>% 
  group_by(year) %>%
  summarise(
    unique_municipios = n_distinct(mun_edo),
    unique_parroquias = n_distinct(mun_par),
    unique_centros = n_distinct(par_centro),  # Count unique voting centers in dataset
    mesas = n()
  ) %>%
  left_join(official_counts, by = "year") %>%  # Merge official counts
  mutate(
    perc_municipios = unique_municipios / total_municipalities * 100,
    perc_parroquias = unique_parroquias / total_parishes * 100,
    perc_centros = unique_centros / total_centros * 100,
    perc_mesas = mesas / total_mesas * 100
  )

# Generate GT Table
ven_summary_table <- gt(summary_counts) %>%
  cols_hide(columns = c(total_mesas,total_centros)) %>% 
  tab_header(title = "Coverage of Municipalities, Parishes, and Voting Centers in the Dataset") %>%
  cols_move_to_start(
    columns = c("year", "unique_municipios", "perc_municipios", 
                "unique_parroquias", "perc_parroquias", 
                "unique_centros", "perc_centros", "mesas", "perc_mesas")
  ) %>%
  cols_label(
    year = "Election Year",
    unique_centros = "Voting Centers (Dataset)",
    perc_centros = "% of Voting Centers",
    unique_parroquias = "Parishes (Dataset)",
    perc_parroquias = "% of Parishes",
    unique_municipios = "Municipalities (Dataset)",
    perc_municipios = "% of Municipalities",
    mesas = "Polling Stations (Dataset)",
    perc_mesas = "% of Voting Tables"
  ) %>%
  fmt_number(
    columns = c(unique_centros, unique_parroquias, unique_municipios, mesas),
    decimals = 0
  ) %>%
  fmt_number(
    columns = c(perc_municipios, perc_parroquias, perc_centros, perc_mesas),
    decimals = 1,
    suffixing = TRUE
  ) %>%
  tab_options(
    table.font.size = "medium",
    column_labels.font.weight = "bold",
    heading.align = "center"
  ) %>%
  tab_style(
    style = cell_text(weight = "bold"),
    locations = cells_column_labels()
  ) %>%
  tab_footnote(
    footnote = "In 2018, the dataset includes 1137 parishes due to the temporary duplication of the '5 de Julio' parish, which appeared in both Municipio Caroní (Bolívar) and Municipio Casacoima (Delta Amacuro).",
    locations = cells_body(columns = unique_parroquias, rows = 4)
  ) %>%
  tab_footnote(
    footnote = "Official registered voting centers: 2024: 15.797 | 2018: 14.638 | 2013: 13.557 | 2012: 14.125 | 2006: 11.118.",
    locations = cells_title()
  ) %>%
  tab_footnote(
    footnote = "Official registered voting tables: 2024: 30.026 | 2018: 34.143 | 2013: 39.322 | 2012: 38.496 | 2006: 32.331.",
    locations = cells_title()
  ) %>%
  tab_source_note(source_note = "Source: Consejo Nacional Electoral (CNE) and resultadosconvenezuela.com for the 2024 election.")

ven_summary_table

gtsave(ven_summary_table, filename = "ven_summary_table.docx")


summary(ven$Población)


missingpob <- ven %>% filter(is.na(Población)) %>% 
  mutate(
    mun_par = paste(Municipio, Parroquia, sep = "_")
  ) %>% 
  group_by(year) %>%
  summarise(
    unique_parroquias = n_distinct(mun_par),
  )

table(missingpob$Municipio)

missingpob_mun <- ven %>% filter(is.na(Población)) %>% 
  mutate(edo_mun = paste(estado, Municipio, sep = "_"))


missing_mun_names <- unique(missingpob_mun$edo_mun)

sort(missing_mun_names)



# Compute summary counts
census_summary_counts <- ven_mun %>%
  filter(year == 2018) %>% # I filter for 2018 because it remains constant every year
  mutate(estado = tolower(estado),
         estado = str_to_title(estado)) %>% 
  group_by(estado) %>% 
  summarise(
    total_municipios = n(),  # Total number of municipalities
    
    # Municipality size categories (inhabitants)
    municipios_less_25k = sum(mun_size_hab == "less than 25k", na.rm = TRUE),
    municipios_25k_50k = sum(mun_size_hab == "from 25k to 50k", na.rm = TRUE),
    municipios_50k_100k = sum(mun_size_hab == "from 50k to 100k", na.rm = TRUE),
    municipios_more_100k = sum(mun_size_hab == "more than 100k", na.rm = TRUE),
    
    # Urban-Rural classification (OECD)
    municipios_rural_ocde = sum(urb_level_ocde == "rural", na.rm = TRUE),
    municipios_urban_ocde = sum(urb_level_ocde == "urban", na.rm = TRUE),
    
    # Urban-Rural classification (OECD)
    municipios_city_onu = sum(urb_level_onu == "urban", na.rm = TRUE),
    municipios_town_onu = sum(urb_level_onu == "urban-town", na.rm = TRUE),
    municipios_rural_onu = sum(urb_level_onu == "rural", na.rm = TRUE),
    
    .groups = "drop"
  )

total_row <- census_summary_counts %>%
  summarise(
    estado = "Total",  # Label for total row
    across(where(is.numeric), \(x) sum(x, na.rm = TRUE))  # Sum all numeric columns
  )

census_summary_counts <- bind_rows(census_summary_counts, total_row)

ven_census_mun_summary_table <- gt(census_summary_counts) %>% 
  tab_header(title = "Municipalities by State: Size and Urban-Rural Classification") %>%
  
  cols_label(
    estado = "State",
    total_municipios = "Total Municipalities",
    
    # Municipality Size Categories
    municipios_less_25k = "Less than 25K",
    municipios_25k_50k = "25K - 50K",
    municipios_50k_100k = "50K - 100K",
    municipios_more_100k = "More than 100K",
    
    # Urban-Rural Classification (OECD)
    municipios_rural_ocde = "Rural (Low-density cluster)",
    municipios_urban_ocde = "Urban (Dense urban cluster)",
    
    # ONU Urbanization Categories
    municipios_city_onu = "City (OECD)",
    municipios_town_onu = "Town (OECD)",
    municipios_rural_onu = "Rural (OECD)"
  ) %>%
  
  fmt_number(
    columns = everything(),
    decimals = 0
  ) %>%
  
  tab_options(
    table.font.size = "medium",
    column_labels.font.weight = "bold",
    heading.align = "center"
  ) %>%
  
  tab_style(
    style = cell_text(weight = "bold"),
    locations = cells_column_labels()
  ) %>%
  
  tab_style(
    style = cell_text(weight = "bold"),
    locations = cells_body(rows = estado == "Total")
  ) %>%
  
  tab_source_note(
    source_note = "Source: Instituto Nacional de Estadistica, 2011 census (last available census)."
  ) %>%
  tab_footnote(
    footnote = "OECD urban classification: Cities (>50,000 inhabitants, density >1,500/km²), Towns (>5,000 inhabitants, density >300/km²), Rural areas (low-density or uninhabited).",
    locations = cells_column_labels(columns = c(municipios_city_onu, municipios_town_onu, municipios_rural_onu))
  ) %>%
  tab_footnote(
    footnote = "Urban-rural typology: Urban (density >= 150/km²), Rural (density < 150/km²).",
    locations = cells_column_labels(columns = c(municipios_rural_ocde, municipios_urban_ocde))
  )

ven_census_mun_summary_table

names(ven_mun)

gtsave(ven_census_mun_summary_table, filename = "ven_census_mun_summary_table.docx")

names(ven)


### Evlucion de voto del chavismo


ven %>%
  group_by(year) %>%
  summarise(p_chavismo = weighted.mean(of_p, validos, na.rm = TRUE)*100) %>% 
  # Plot Chavismo support over time
  ggplot(., aes(x = year, y = p_chavismo, group = 1)) +
  geom_line(color = "red") +
  geom_point(color = "red") +
  geom_hline(yintercept = 50, linetype = "dashed", color = "black") +
  labs(title = "Evolution of Chavismo electoral share",
       x = "",
       y = "Vote (%)") +
  theme_minimal()

ggsave("year_evolution_of_of.svg")

ven %>%
  group_by(year) %>%
  summarise(p_oposicion = weighted.mean(op_p, validos, na.rm = TRUE)*100) %>% 
  # Plot Opposition support over time
  ggplot(., aes(x = year, y = p_oposicion, group = 1)) +
  geom_line(color = "blue") +
  geom_point(color = "blue") +
  geom_hline(yintercept = 50, linetype = "dashed", color = "black")+
  labs(title = "Evolution of opposition electoral share",
       x = "",
       y = "Vote (%)") +
  theme_minimal()

ggsave("year_evolution_of_op.svg")


ven %>%
  group_by(year) %>%
  summarise(p_others = weighted.mean(otro_p, validos, na.rm = TRUE)*100) %>% 
  # Plot Opposition support over time
  ggplot(., aes(x = year, y = p_others, group = 1)) +
  geom_line(color = "purple") +
  geom_point(color = "purple") +
  geom_hline(yintercept = 50, linetype = "dashed", color = "black")+
  labs(title = "Evolution of minoritarian parties electoral share",
       x = "",
       y = "Vote (%)") +
  theme_minimal()

ggsave("year_evolution_of_others.svg")


vote_share <- ven %>%
  group_by(year) %>%
  summarise(
    p_chavismo = weighted.mean(of_p, validos, na.rm = TRUE),
    p_oposicion = weighted.mean(op_p, validos, na.rm = TRUE),
    p_others = weighted.mean(otro_p, validos, na.rm = TRUE)
  ) %>%
  pivot_longer(cols = c(p_chavismo, p_oposicion,p_others), 
               names_to = "Political_Group", 
               values_to = "Support") %>% 
  # Plot both trends
  ggplot(., aes(x = year, y = Support, color = Political_Group, group = Political_Group)) +
  geom_line(size = 1) +
  geom_point() +
  geom_hline(yintercept = 50, linetype = "dashed", color = "black") +
  scale_color_manual(values = c("p_chavismo" = "red", "p_oposicion" = "blue", "p_others" = "purple"),
                     labels = c("Chavismo", "Opposition", "Others")) +
  labs(title = "Share of the vote by political bloc: Venezuela Presidential elections 2006-2024",
       x = "",
       y = "Vote (%)",
       color = "Political bloc") +
  theme_minimal() + theme(axis.title.y = element_text(vjust = 2, size = 9),
                          title = element_text(size = 10))
vote_share

ggsave("year_evolution_of_of_op_ot.jpg")



ven %>%
  group_by(year) %>%
  summarise(turnout = weighted.mean(turnout, rep_c, na.rm = TRUE)) %>% 
  # Plot Opposition support over time
  ggplot(., aes(x = year, y = turnout, group = 1)) +
  geom_line() +
  geom_point() +
  labs(title = "Evolution of turnout over time",
       x = "",
       y = "Vote (%)") +
  theme_minimal()

ggsave("yearly_turnout.svg")

